Executing R from Jupyter notebook with Python kernel!¶

In [ ]:

% load_ext rpy2.ipython
import rpy2.robjects as robj

In [37]:

%%R
N_iter=20
y=rep(NA,N_iter)

set.seed(123)

for(i in 1:N_iter){
cat(i,"\r")
rands=rnorm(2^i)
y[i]=max(rands)
}

In [38]:

%%R

plot(1:N_iter, y, type="l")

In [36]:

%%R
grades=read.csv("grades.csv")
grades$X=NULL
head(grades)

meangrades=c()

for (i in 1:nrow(grades)){
meangrades[i]=mean(as.matrix(grades[i,2:ncol(grades)]))
}

head(meangrades)

[1] 49.25 59.00 44.00 50.00 55.75 56.75

In [17]:

%%R
meangrades = apply (grades[,2:ncol(grades)],1,mean)
head(meangrades)

[1] 49.25 59.00 44.00 50.00 55.75 56.75

Passing objects from Python to R and vice versa¶

In [186]:

import rpy2.robjects as robj
from rpy2.robjects import r # R instance
import numpy as np
import pandas as pd

In [32]:

r["meangrades"]

Out[32]:

FloatVector with 200 elements.

49.250000

59.000000

44.000000

50.000000

...

41.500000

51.250000

59.750000

61.000000

In [33]:

type(r["meangrades"])

Out[33]:

rpy2.robjects.vectors.FloatVector

In [42]:

meangrades_np = np.array(r["meangrades"])
meangrades_np[:10]

Out[42]:

array([49.25, 59.  , 44.  , 50.  , 55.75, 56.75, 53.75, 41.5 , 55.  ,
       52.  ])

To pass Python object to R use constructors¶

In [62]:

r["meangrades_back"]

---------------------------------------------------------------------------
LookupError                               Traceback (most recent call last)
<ipython-input-62-d770a250cbb4> in <module>()
----> 1 r["meangrades_back"]

/usr/local/lib/python3.6/dist-packages/rpy2/robjects/__init__.py in __getitem__(self, item)
    329 
    330     def __getitem__(self, item):
--> 331         res = _globalenv.get(item)
    332         res = conversion.ri2py(res)
    333         if hasattr(res, '__rname__'):

LookupError: 'meangrades_back' not found

In [67]:

meangrades_back = robj.Vector(meangrades_np)
_ = r(f"meangrades_back = {meangrades_back.r_repr()}")
r["meangrades_back"][:10]

Out[67]:

array([49.25, 59.  , 44.  , 50.  , 55.75, 56.75, 53.75, 41.5 , 55.  ,
       52.  ])

Pandas has native binding¶

In [51]:

from rpy2.robjects import pandas2ri
pandas2ri.activate()

In [54]:

grades = pd.read_csv("grades.csv", index_col=0)
grades.head()

Out[54]:

	id	write	math	science	socst
1	70	52	41	47	57
2	121	59	53	63	61
3	86	33	54	58	31
4	141	44	47	53	56
5	172	52	57	53	61

In [57]:

r_dataframe = pandas2ri.py2ri(grades)
_ = r(f"grades_back = {r_dataframe.r_repr()}")

In [60]:

r["grades_back"].head()

Out[60]:

	id	write	math	science	socst
0	70	52	41	47	57
1	121	59	53	63	61
2	86	33	54	58	31
3	141	44	47	53	56
4	172	52	57	53	61

R if¶

In [187]:

%%R
grades$Filt=NA
head(grades)

   id write math science socst Filt
1  70    52   41      47    57   NA
2 121    59   53      63    61   NA
3  86    33   54      58    31   NA
4 141    44   47      53    56   NA
5 172    52   57      53    61   NA
6 113    52   51      63    61   NA

In [68]:

%%R
for (i in 1:nrow(grades)){
if (grades$write[i] > 50){
grades$Filt[i] = "A"
}
else if (grades$write[i] > 40 & grades$write[i] <= 50){
grades$Filt[i] = "B"
}
else {
grades$Filt[i] = "C"
}
}

head(grades)

   id write math science socst Filt
1  70    52   41      47    57    A
2 121    59   53      63    61    A
3  86    33   54      58    31    C
4 141    44   47      53    56    B
5 172    52   57      53    61    A
6 113    52   51      63    61    A

In [70]:

%%R
x = c(-1,4,-5,2,7)
x

ifelse(x > 0, "pos","neg")

[1] "neg" "pos" "neg" "pos" "pos"

R functions¶

In [135]:

%%R
My_Func = function (x, y=2){
x ^ y
}

In [137]:

%%R
My_Func(x)

[1]  1 16 25  4 49

In [138]:

%%R
My_Func(2)

[1] 4

In [139]:

%%R
My_Func(2,4)

[1] 16

Lapply and sapply¶

In [140]:

%%R
l = list(1, c(1,2,3), c(3,4))
l

[[1]]
[1] 1

[[2]]
[1] 1 2 3

[[3]]
[1] 3 4

In [141]:

%%R
lapply(l, sum)

[[1]]
[1] 1

[[2]]
[1] 6

[[3]]
[1] 7

In [142]:

%%R
sapply(l,sum)

[1] 1 6 7

In [143]:

%%R
lapply(l, function(x){return(c(min(x), max(x)))})

[[1]]
[1] 1 1

[[2]]
[1] 1 3

[[3]]
[1] 3 4

In [144]:

%%R
sapply(l, function(x){return(c(min(x), max(x)))})

     [,1] [,2] [,3]
[1,]    1    1    3
[2,]    1    3    4

Installing and importing packages in Rpy2¶

In [75]:

# import rpy2's package module
import rpy2.robjects.packages as rpackages

# import R's utility package
utils = rpackages.importr('utils')

# select a mirror for R packages
utils.chooseCRANmirror(ind=1) # select the first mirror in the list

Out[75]:

rpy2.rinterface.NULL

In [77]:

# R package names
packnames = ('reshape2', 'ggplot2')

# R vector of strings
from rpy2.robjects.vectors import StrVector

# Selectively install what needs to be install.
# We are fancy, just because we can.
names_to_install = [x for x in packnames if not rpackages.isinstalled(x)]
if len(names_to_install) > 0:
    utils.install_packages(StrVector(names_to_install))

In [80]:

reshape = rpackages.importr('reshape2')

R melt and cast - helpful for subsequent plotting and modelling¶

title

In [110]:

%%R
a=data.frame(name=c('John', 'Mary', 'Peter', 'Susan'),
sex=c('m','f','m','f'),
age=c(26,21,19,29),
weight=c(82, 56, 79, 60),
height=c(182, 171, 179, 175))
a

   name sex age weight height
1  John   m  26     82    182
2  Mary   f  21     56    171
3 Peter   m  19     79    179
4 Susan   f  29     60    175

In [111]:

%%R
a_melt = melt (a, id.vars = c('name', 'sex'), 
               variable_name = "a_var", value.name = 'a_name')
a_melt

    name sex variable a_name
1   John   m      age     26
2   Mary   f      age     21
3  Peter   m      age     19
4  Susan   f      age     29
5   John   m   weight     82
6   Mary   f   weight     56
7  Peter   m   weight     79
8  Susan   f   weight     60
9   John   m   height    182
10  Mary   f   height    171
11 Peter   m   height    179
12 Susan   f   height    175

Cast == invert melt¶

In [113]:

%%R
dcast(a_melt, name ~ a_var)

dcast(a_melt, name + sex ~ a_var)

Error in FUN(X[[i]], ...) : object 'a_var' not found

Juggling data using Pandas¶

Melt and pivot¶

In [91]:

df = r["a"]
type(df) # pandas2ri converted it automatically

Out[91]:

pandas.core.frame.DataFrame

In [92]:

df

Out[92]:

	name	sex	age	weight	height
0	John	m	26.0	82.0	182.0
1	Mary	f	21.0	56.0	171.0
2	Peter	m	19.0	79.0	179.0
3	Susan	f	29.0	60.0	175.0

In [121]:

molten = df.melt(id_vars=["name", "sex"], 
                 var_name="a_var", value_name="a_name")
molten

Out[121]:

	name	sex	a_var	a_name
0	John	m	age	26.0
1	Mary	f	age	21.0
2	Peter	m	age	19.0
3	Susan	f	age	29.0
4	John	m	weight	82.0
5	Mary	f	weight	56.0
6	Peter	m	weight	79.0
7	Susan	f	weight	60.0
8	John	m	height	182.0
9	Mary	f	height	171.0
10	Peter	m	height	179.0
11	Susan	f	height	175.0

In [133]:

molten.pivot(columns="a_var", values="a_name", index="name")

Out[133]:

a_var	age	height	weight
name
John	26.0	182.0	82.0
Mary	21.0	171.0	56.0
Peter	19.0	179.0	79.0
Susan	29.0	175.0	60.0

GroupBy object in Pandas¶

In [146]:

trials = pd.read_hdf("nrdd_rephub_targets.hdf")
trials.sample(3)

Out[146]:

	Name	Indication	Phase	Therapeutic categories	rdkit_smiles	Targets	CID
5442	edaglitazone	non-insulin dependent diabetes	discontinued	{ENDOCRINE DRUGS}	Cc1oc(-c2ccccc2)nc1CCOc1ccc(CC2SC(=O)NC2=O)c2s...	NaN	9825701
7117	modafinil	shift work disorder (swd)	pre-registration and above	{NEUROLOGIC DRUGS, PSYCHOPHARMACOLOGIC DRUGS, ...	NC(=O)CS(=O)C(c1ccccc1)c1ccccc1	{CYP2D6, CYP2C19, CYP3A4, Slc6a3, PTGS2, SLC6A...	4236
2600	il-16	asthma	discontinued	{ANTIINFECTIVE THERAPY}	NN1C(=O)CC(c2cccc(Br)c2)C1=O	NaN	125225

title

In [148]:

trials_gby = trials.groupby("Phase")

Out[148]:

<pandas.core.groupby.DataFrameGroupBy object at 0x7fb19d436c88>

In [165]:

sizes = trials_gby.size().sort_values()
sizes

Out[165]:

Phase
phase 1 clinical               123
discovery                      135
phase 3 clinical               208
phase 2 clinical               421
pre-registration and above    2961
discontinued                  3434
dtype: int64

In [191]:

import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="whitegrid", font_scale=1.6)

In [192]:

sns.barplot(sizes.index, sizes.values)
plt.xticks(rotation=45)

Out[192]:

(array([0, 1, 2, 3, 4, 5]), <a list of 6 Text xticklabel objects>)

.apply - get one value for each group¶

In [173]:

def size_apply(df):
    return len(df)

In [174]:

trials_gby.apply(size_apply)

Out[174]:

Phase
discontinued                  3434
discovery                      135
phase 1 clinical               123
phase 2 clinical               421
phase 3 clinical               208
pre-registration and above    2961
dtype: int64

.agg - get one value for each column in each group¶

In [179]:

def size_agg(ser):
    return len(ser)

In [180]:

trials_gby.agg(size_agg)

Out[180]:

	Name	Indication	Therapeutic categories	rdkit_smiles	Targets	CID
Phase
discontinued	3434	3434	3434	3434	3434	3434
discovery	135	135	135	135	135	135
phase 1 clinical	123	123	123	123	123	123
phase 2 clinical	421	421	421	421	421	421
phase 3 clinical	208	208	208	208	208	208
pre-registration and above	2961	2961	2961	2961	2961	2961

In [183]:

from random import choice
grades = pd.read_csv("grades.csv", index_col=0)
grades["favourite_color"] = [choice(["blue", "red", 
                                    "green", "hazelnut"])
                             for _ in grades.index]

In [185]:

grades_gby = grades.groupby("favourite_color")
grades_gby.agg(np.mean)

Out[185]:

	id	write	math	science	socst
favourite_color
blue	98.294118	50.784314	51.529412	49.294118	52.490196
green	100.810345	54.586207	53.931034	52.137931	52.603448
hazelnut	105.794872	51.128205	51.897436	53.282051	51.589744
red	98.346154	53.942308	52.865385	52.961538	52.711538